{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "2ec44ea7-dd0f-4f42-8722-d76637cadb57",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "train_df = pd.read_csv('./酒店住宿价格预测挑战赛公开数据/train.csv')\n",
    "test_df = pd.read_csv('./酒店住宿价格预测挑战赛公开数据/test.csv')\n",
    "train_df['target'] = np.log1p(train_df['target'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64b2c01c-d827-4cca-a1d6-57366d9eb045",
   "metadata": {},
   "source": [
    "| 特征字段                       | 字段描述                      |\n",
    "| ------------------------------ | ----------------------------- |\n",
    "| id                             | 样本标识id                    |\n",
    "| host_id                        | 酒店id                        |\n",
    "| neighbourhood_group            | 街区分组                      |\n",
    "| neighbourhood                  | 街区                          |\n",
    "| room_type                      | 房间类型                      |\n",
    "| minimum_nights                 | 最低夜晚数                    |\n",
    "| number_of_reviews              | 评论数量                      |\n",
    "| last_review                    | 最新评论时间                  |\n",
    "| reviews_per_month              | 每月评论数量                  |\n",
    "| calculated_host_listings_count | 酒店的订单数量                |\n",
    "| availability                   | 未来365天内房间可以预订的天数 |\n",
    "| region_1_id                    | 酒店区域ID1                   |\n",
    "| region_2_id                    | 酒店区域ID2                   |\n",
    "| region_3_id                    | 酒店区域ID3                   |\n",
    "| target                         | 酒店住宿价格（已脱敏处理）    |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "b832eaea-a18d-46f5-a4c9-43e859ab3411",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>host_id</th>\n",
       "      <td>7609</td>\n",
       "      <td>35608</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>neighbourhood_group</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>neighbourhood</th>\n",
       "      <td>20</td>\n",
       "      <td>183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>room_type</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>minimum_nights</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>number_of_reviews</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>last_review</th>\n",
       "      <td>2019-03-24</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>reviews_per_month</th>\n",
       "      <td>0.09</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>calculated_host_listings_count</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>availability</th>\n",
       "      <td>7</td>\n",
       "      <td>180</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>region_1_id</th>\n",
       "      <td>8</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>region_2_id</th>\n",
       "      <td>37</td>\n",
       "      <td>28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>region_3_id</th>\n",
       "      <td>332</td>\n",
       "      <td>339</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>target</th>\n",
       "      <td>6.306275</td>\n",
       "      <td>4.875197</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         0         1\n",
       "id                                       0         1\n",
       "host_id                               7609     35608\n",
       "neighbourhood_group                      0         2\n",
       "neighbourhood                           20       183\n",
       "room_type                                1         0\n",
       "minimum_nights                           4         1\n",
       "number_of_reviews                        4         0\n",
       "last_review                     2019-03-24       NaN\n",
       "reviews_per_month                     0.09       NaN\n",
       "calculated_host_listings_count           1         1\n",
       "availability                             7       180\n",
       "region_1_id                              8        18\n",
       "region_2_id                             37        28\n",
       "region_3_id                            332       339\n",
       "target                            6.306275  4.875197"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.iloc[:2].T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "0606adca-7e4f-4e90-8188-304ae675d7ae",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     1254\n",
       "1     1577\n",
       "2      119\n",
       "3     1334\n",
       "4     1497\n",
       "5     1022\n",
       "6      324\n",
       "7      192\n",
       "8      749\n",
       "9      593\n",
       "10     968\n",
       "11     301\n",
       "12     907\n",
       "13    1287\n",
       "14    1599\n",
       "15     123\n",
       "16     265\n",
       "17    1156\n",
       "18     265\n",
       "19    2339\n",
       "20    1719\n",
       "21    1202\n",
       "22     324\n",
       "23     631\n",
       "24      44\n",
       "25    2311\n",
       "26    1095\n",
       "27     385\n",
       "28     897\n",
       "29    1809\n",
       "30    1364\n",
       "31     348\n",
       "Name: region_1_id, dtype: int64"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df['region_1_id'].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "e115b48b-cdec-4f65-9898-655d2bc91539",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       55\n",
       "1      160\n",
       "2       50\n",
       "3      115\n",
       "4      167\n",
       "      ... \n",
       "123     93\n",
       "124      7\n",
       "125    120\n",
       "126     68\n",
       "127     95\n",
       "Name: region_2_id, Length: 128, dtype: int64"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df['region_2_id'].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "36a60ca9-5391-4799-a4c2-07515d252d8b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                               -0.004852\n",
       "host_id                          -0.009706\n",
       "neighbourhood_group              -0.020181\n",
       "neighbourhood                    -0.157885\n",
       "room_type                         0.468608\n",
       "minimum_nights                    0.033968\n",
       "number_of_reviews                -0.046053\n",
       "reviews_per_month                -0.045008\n",
       "calculated_host_listings_count    0.140425\n",
       "availability                      0.100097\n",
       "region_1_id                       0.122547\n",
       "region_2_id                      -0.030460\n",
       "region_3_id                      -0.019899\n",
       "target                            1.000000\n",
       "Name: target, dtype: float64"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.corr().target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "3ef06d23-e3b2-4928-af26-28a48a6324a6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_predict, cross_validate\n",
    "from lightgbm import LGBMRegressor\n",
    "from catboost import CatBoostRegressor\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.metrics import mean_absolute_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "7b322185-6e74-404b-a40d-d4ad1736b2a3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train_df['last_review_isnull'] = train_df['last_review'].isnull()\n",
    "train_df['last_review_year'] = pd.to_datetime(train_df['last_review']).dt.year\n",
    "\n",
    "train_df['neighbourhood_group_mean'] = train_df['neighbourhood_group'].map(train_df.groupby(['neighbourhood_group'])['target'].mean())\n",
    "train_df['neighbourhood_group_counts'] = train_df['neighbourhood_group'].map(train_df['neighbourhood_group'].value_counts())\n",
    "\n",
    "train_df['room_type_mean'] = train_df['room_type'].map(train_df.groupby(['room_type'])['target'].mean())\n",
    "train_df['room_type_counts'] = train_df['room_type'].map(train_df['room_type'].value_counts())\n",
    "\n",
    "train_df['region_1_id_mean'] = train_df['region_1_id'].map(train_df.groupby(['region_1_id'])['target'].mean())\n",
    "train_df['region_1_counts'] = train_df['region_1_id'].map(train_df['region_1_id'].value_counts())\n",
    "\n",
    "train_df['region_2_id_mean'] = train_df['region_2_id'].map(train_df.groupby(['region_2_id'])['target'].mean())\n",
    "train_df['region_2_counts'] = train_df['region_2_id'].map(train_df['region_2_id'].value_counts())\n",
    "\n",
    "train_df['region_3_id_mean'] = train_df['region_3_id'].map(train_df.groupby(['region_3_id'])['target'].mean())\n",
    "train_df['region_3_counts'] = train_df['region_3_id'].map(train_df['region_3_id'].value_counts())\n",
    "\n",
    "train_df['availability_month'] = train_df['availability'] // 30\n",
    "train_df['availability_week'] = train_df['availability'] // 7\n",
    "\n",
    "train_df['reviews_per_month_count'] = train_df['reviews_per_month'] * train_df['calculated_host_listings_count'] \n",
    "train_df['room_type_calculated_host_listings_count'] = train_df['room_type'].map(train_df.groupby(['room_type'])['calculated_host_listings_count'].sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "aecf0c08-eff6-4e82-8502-a3ffe4243f20",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "test_df['last_review_isnull'] = test_df['last_review'].isnull()\n",
    "test_df['last_review_year'] = pd.to_datetime(test_df['last_review']).dt.year\n",
    "\n",
    "test_df['neighbourhood_group_mean'] = test_df['neighbourhood_group'].map(train_df.groupby(['neighbourhood_group'])['target'].mean())\n",
    "test_df['neighbourhood_group_counts'] = test_df['neighbourhood_group'].map(train_df['neighbourhood_group'].value_counts())\n",
    "\n",
    "test_df['room_type_mean'] = test_df['room_type'].map(train_df.groupby(['room_type'])['target'].mean())\n",
    "test_df['room_type_counts'] = test_df['room_type'].map(train_df['room_type'].value_counts())\n",
    "\n",
    "test_df['region_1_id_mean'] = test_df['region_1_id'].map(train_df.groupby(['region_1_id'])['target'].mean())\n",
    "test_df['region_1_counts'] = test_df['region_1_id'].map(train_df['region_1_id'].value_counts())\n",
    "\n",
    "test_df['region_2_id_mean'] = test_df['region_2_id'].map(train_df.groupby(['region_2_id'])['target'].mean())\n",
    "test_df['region_2_counts'] = test_df['region_2_id'].map(train_df['region_2_id'].value_counts())\n",
    "\n",
    "test_df['region_3_id_mean'] = test_df['region_3_id'].map(train_df.groupby(['region_3_id'])['target'].mean())\n",
    "test_df['region_3_counts'] = test_df['region_3_id'].map(train_df['region_3_id'].value_counts())\n",
    "\n",
    "test_df['availability_month'] = test_df['availability'] // 30\n",
    "test_df['availability_week'] = test_df['availability'] // 7\n",
    "\n",
    "test_df['reviews_per_month_count'] = test_df['reviews_per_month'] * test_df['calculated_host_listings_count'] \n",
    "test_df['room_type_calculated_host_listings_count'] = test_df['room_type'].map(train_df.groupby(['room_type'])['calculated_host_listings_count'].sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "4c21b3ad-8071-43fb-9bb3-f789dfbcf4e0",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "110.4459987795426"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val_pred = cross_val_predict(\n",
    "    # LGBMRegressor(verbose=0, force_row_wise=True),\n",
    "    CatBoostRegressor(verbose=0,n_estimators=1000),\n",
    "    # XGBRegressor(),\n",
    "    train_df.drop(['id', 'target', 'last_review'], axis=1),\n",
    "    train_df['target']\n",
    ")\n",
    "\n",
    "mean_absolute_error(np.exp(val_pred)-1, np.exp(train_df['target'])-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "5c80924a-b645-4ee1-b244-637654a42d87",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "cat_val = cross_validate(\n",
    "    CatBoostRegressor(verbose=0,n_estimators=1000),\n",
    "    train_df.drop(['id', 'target', 'last_review'], axis=1),\n",
    "    train_df['target'],\n",
    "    return_estimator=True\n",
    ")\n",
    "\n",
    "lgb_val = cross_validate(\n",
    "    LGBMRegressor(verbose=0, force_row_wise=True),\n",
    "    train_df.drop(['id', 'target', 'last_review'], axis=1),\n",
    "    train_df['target'],\n",
    "    return_estimator=True\n",
    ")\n",
    "\n",
    "xgb_val = cross_validate(\n",
    "    XGBRegressor(),\n",
    "    train_df.drop(['id', 'target', 'last_review'], axis=1),\n",
    "    train_df['target'],\n",
    "    return_estimator=True\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "dbe212ad-d618-45ae-9a05-5c93a4222b68",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pred = np.zeros(len(test_df))\n",
    "# for clf in cat_val['estimator'] + lgb_val['estimator'] + xgb_val['estimator']:\n",
    "for clf in cat_val['estimator']:\n",
    "    pred += clf.predict(test_df.drop(['id', 'last_review'], axis=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "67508ab0-e427-4c03-8326-44ac1583ab74",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pred /= 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "5043a90b-ec05-44e1-af12-316246570758",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pred = np.exp(pred) - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "209631ad-3307-4c47-9b85-beb2c0a5cd2d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pd.DataFrame({'id': range(30000, 40000), 'target': pred}).to_csv('a.csv', index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6fb5c0d-6fb1-4222-b292-590cdeea6d3a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
